---
title: "R Notebook"
output:
  html_document:
    df_print: paged
---



# Data

Loading packages and data for the UNGDC analysis (the data file read below is the QoG `qog_std_ts_jan20.csv` file).


```{r message=FALSE}
#Loading packages and data
library(readtext)
library(quanteda)
library(rworldmap)
library(RColorBrewer)
library(haven)
library(readxl)

library(tidyverse)
library(tidymodels)
library(rsample)

# Modelling packages
library(caret)
library(caretEnsemble)
library(earth)
library(xgboost)
library(ranger)
library(rpart)
library(rpart.plot)

# Model interpretability packages
library(vip)
library(pdp)
library(lime)
library(jtools)
```




```{r}
# ccode values of the defunct countries that QoG kept in the dataset
crazy_qog <- c(993, 280, 230, 991, 992, 997, 736, 998, 886)

# Read the QoG CSV file and keep only the rows whose ccode is not listed above
qog_data <- read_csv("qog_std_ts_jan20.csv") %>%
  filter(!ccode %in% crazy_qog)

```



```{r}
# Build the modelling table from qog_data: rows with year after 1999, reduced
# to year, the country code (ccodealp, renamed to country) and the covariates,
# then sorted by country and year.
model_data <- qog_data %>%
  filter(year > 1999) %>%
  select(
    year,
    country = ccodealp,
    wdi_pop, wdi_lifexp, wdi_mortinf, ihme_hle_0104t, wdi_chexppgdp,
    wdi_lrmd, wdi_co2, wdi_fossil, ef_carb,
    wdi_oilrent, wdi_enerenew, wdi_acel, lp_lat_abst,
    fh_ipolity2, wdi_gdpcapcon2010, wdi_gdpgr, wdi_trade
  ) %>%
  arrange(country, year)

# Currently disabled: restrict to rows with a region and treat it as a factor
#model_data <- model_data %>% filter(!is.na(ht_region))

#model_data$ht_region <- factor(model_data$ht_region)

```



```{r}
library(janitor)

# Show any rows of model_data that share the same country-year combination
get_dupes(model_data, country, year)
```






# Visualise missing data
```{r}

# Raster plot of missingness in model_data. is.na() yields a logical matrix and
# reshape2::melt() turns it into long format (Var1 = row, Var2 = column,
# value = TRUE where the cell is missing).
model_data %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
    geom_raster() +
    coord_flip() +
    scale_y_continuous(NULL, expand = c(0, 0)) +
    # Named labels are matched to the logical values themselves, so the legend
    # stays correct even when only one of FALSE/TRUE occurs in the data.
    scale_fill_grey(name = "",
                    labels = c("FALSE" = "Present",
                               "TRUE" = "Missing")) +
    xlab("Observation") +
    theme(axis.text.y = element_text(size = 6))

# Write the plot shown above (ggsave() defaults to the last plot) to a PDF
ggsave("missing_data.pdf")
```



```{r}
# impute missing data
# Bagged-tree imputation involves random resampling; fix the seed so that the
# imputed values are identical on every run of the notebook.
set.seed(2020)

# All columns of model_data enter the recipe as predictors; the listed
# covariates are the ones that get imputed.
imputed_covariates <- recipe(~ ., model_data) %>%
  step_bagimpute(
    wdi_pop, wdi_lifexp, wdi_mortinf, ihme_hle_0104t, wdi_chexppgdp,
    wdi_lrmd, wdi_co2, wdi_fossil, ef_carb,
    wdi_oilrent, wdi_enerenew, wdi_acel, lp_lat_abst,
    fh_ipolity2, wdi_gdpcapcon2010, wdi_gdpgr, wdi_trade
  ) %>%
  # prep() receives its data through `training`; `data` is not one of its
  # arguments.
  prep(training = model_data) %>%
  # Apply the trained imputation to the same data set
  bake(model_data)

```


```{r}
# visualise new missing data
# Same raster plot as for the raw data, now on the imputed table: is.na()
# yields a logical matrix and reshape2::melt() turns it into long format
# (Var1 = row, Var2 = column, value = TRUE where the cell is missing).
imputed_covariates %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
    geom_raster() +
    coord_flip() +
    scale_y_continuous(NULL, expand = c(0, 0)) +
    # Named labels are matched to the logical values themselves, so the legend
    # stays correct when imputation leaves no TRUE (missing) cells at all.
    scale_fill_grey(name = "",
                    labels = c("FALSE" = "Present",
                               "TRUE" = "Missing")) +
    xlab("Observation") +
    theme(axis.text.y = element_text(size = 6))

# Write the plot shown above (ggsave() defaults to the last plot) to a PDF
ggsave("missing_data_imputed.pdf")
```



```{r}
# Keep year, country and the imputed covariates, then replace population
# (wdi_pop) by its natural logarithm.
model_data_imputed <- imputed_covariates %>%
  select(
    year, country,
    wdi_pop, wdi_lifexp, wdi_mortinf, ihme_hle_0104t, wdi_chexppgdp,
    wdi_lrmd, wdi_co2, wdi_fossil, ef_carb,
    wdi_oilrent, wdi_enerenew, wdi_acel, lp_lat_abst,
    fh_ipolity2, wdi_gdpcapcon2010, wdi_gdpgr, wdi_trade
  ) %>%
  mutate(wdi_pop = log(wdi_pop))
```



```{r}
# Membership lists: three-letter country codes that are matched against
# model_data_imputed$country below. Each list becomes a two-level factor
# ("<group>" / "Not <group>").
sids_countries <- c(
  "ASM", "AIA", "ATG", "ABW", "BHS", "BHR", "BRB", "BLZ", "BMU", "CPV",
  "COM", "COK", "CUB", "DMA", "DOM", "FJI", "GRD", "GUM", "GNB", "GUY",
  "HTI", "JAM", "KIR", "MDV", "MHL", "MUS", "FSM", "MSR", "NRU", "NCL",
  "NIU", "PLW", "PNG", "PRI", "KNA", "LCA", "VCT", "WSM", "STP", "SYC",
  "SGP", "SLB", "SUR", "TLS", "TON", "TTO", "TUV", "VUT"
)

# "LVA" (Latvia) was absent from this list although the other member states,
# including GBR and HRV, are present; it is added so that Latvia is not
# labelled "Not EU".
eu <- c(
  "BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR", "GRC",
  "ESP", "PRT", "AUT", "FIN", "SWE", "CZE", "HUN", "POL", "EST", "LVA",
  "LTU", "CYP", "MLT", "SVK", "SVN", "BGR", "ROU", "HRV"
)

g77 <- c(
  "AFG", "DZA", "ARG", "BEN", "BOL", "BRA", "BFA", "BDI", "KHM", "CMR",
  "CAF", "TCD", "CHL", "COL", "COG", "COD", "CRI", "DOM", "ECU", "EGY",
  "SLV", "ETH", "GAB", "GTM", "GHA", "GIN", "HTI", "HND", "IND", "IDN",
  "IRN", "IRQ", "JAM", "JOR", "KEN", "KWT", "LAO", "LBN", "LBR", "LBY",
  "MDG", "MYS", "MLI", "MRT", "MAR", "MMR", "NPL", "NIC", "NER", "NGA",
  "PAK", "PAN", "PRY", "PER", "PHL", "RWA", "SAU", "SEN", "SLE", "SOM",
  "LKA", "SDN", "SYR", "TZA", "THA", "TGO", "TTO", "TUN", "UGA", "URY",
  "VEN", "VNM", "YEM", "AGO", "ATG", "AZE", "BHS", "BHR", "BGD", "BRB",
  "BLZ", "BTN", "BIH", "BWA", "BRN", "CPV", "CHN", "COM", "CIV", "CUB",
  "DJI", "DMA", "GNQ", "ERI", "SWZ", "FJI", "GMB", "GRD", "GNB", "GUY",
  "KIR", "LSO", "MWI", "MDV", "MHL", "MUS", "FSM", "MNG", "MOZ", "NAM",
  "PRK", "NRU", "OMN", "PSE", "PNG", "QAT", "KNA", "LCA", "VCT", "WSM",
  "STP", "SYC", "SGP", "SLB", "ZAF", "SSD", "SUR", "TJK", "TLS", "TON",
  "TKM", "ARE", "VUT", "ZMB", "ZWE"
)

africa_group <- c(
  "DZA", "BEN", "BFA", "BDI", "CMR", "CAF", "TCD", "COG", "COD", "EGY",
  "ETH", "GAB", "GHA", "GIN", "KEN", "LBR", "LBY", "MDG", "MLI", "MRT",
  "MAR", "NER", "NGA", "RWA", "SEN", "SLE", "SOM", "SDN", "TZA", "TGO",
  "TUN", "UGA", "AGO", "BWA", "CPV", "COM", "CIV", "DJI", "GNQ", "ERI",
  "SWZ", "GMB", "GNB", "LSO", "MWI", "MUS", "MOZ", "NAM", "STP", "SYC",
  "ZAF", "SSD", "ZMB", "ZWE"
)

arab_states <- c(
  "DZA", "EGY", "IRQ", "JOR", "KWT", "LBN", "LBY", "MRT", "MAR", "SAU",
  "SOM", "SDN", "SYR", "TUN", "YEM", "BHR", "COM", "DJI", "OMN", "PSE",
  "QAT", "ARE"
)

eig <- c("MEX", "LIE", "MCO", "KOR", "CHE", "GEO")

umbrella <- c(
  "AUS", "BLR", "CAN", "ISL", "ISR", "JPN", "NZL", "KAZ", "NOR", "RUS",
  "UKR", "USA"
)

ldc <- c(
  "AFG", "BEN", "BFA", "BDI", "KHM", "CAF", "TCD", "COD", "ETH", "GIN",
  "HTI", "LAO", "LBR", "MDG", "MLI", "MRT", "MMR", "NPL", "NER", "YEM",
  "AGO", "BGD", "BTN", "COM", "DJI", "ERI", "GMB", "GNB", "LSO", "MWI",
  "MOZ", "SLB", "TLS", "VUT", "KIR", "TUV", "RWA", "STP", "SEN", "SLE",
  "SOM", "SSD", "SDN", "TZA", "TGO", "UGA", "ZMB"
)

# One indicator factor per group; membership is constant across years because
# it is looked up by country code only.
model_data_imputed$sids <- factor(ifelse(model_data_imputed$country %in% sids_countries, "SIDS", "Not SIDS"))

model_data_imputed$eu <- factor(ifelse(model_data_imputed$country %in% eu, "EU", "Not EU"))

model_data_imputed$g77 <- factor(ifelse(model_data_imputed$country %in% g77, "G77", "Not G77"))

model_data_imputed$africa <- factor(ifelse(model_data_imputed$country %in% africa_group, "Africa Group", "Not Africa Group"))

model_data_imputed$arab <- factor(ifelse(model_data_imputed$country %in% arab_states, "Arab States", "Not Arab States"))

model_data_imputed$eig <- factor(ifelse(model_data_imputed$country %in% eig, "EIG", "Not EIG"))

model_data_imputed$umbrella <- factor(ifelse(model_data_imputed$country %in% umbrella, "Umbrella", "Not Umbrella"))

model_data_imputed$ldc <- factor(ifelse(model_data_imputed$country %in% ldc, "LDC", "Not LDC"))


```




```{r}
# Replace the source column codes and the group flags with readable names
# (new name = old name); column positions are unchanged.
model_data_imputed <- rename(
  model_data_imputed,
  # covariates
  Population                     = wdi_pop,
  Life_Expectancy                = wdi_lifexp,
  Infant_Mortality               = wdi_mortinf,
  Healthy_Life_Years             = ihme_hle_0104t,
  Health_Expenditure             = wdi_chexppgdp,
  Maternal_Death                 = wdi_lrmd,
  CO2_Emissions                  = wdi_co2,
  Fossil_Fuel_Energy_Consumption = wdi_fossil,
  Carbon_Footprint               = ef_carb,
  Oil_Rents_pcGDP                = wdi_oilrent,
  Renewable_Energy_Consumption   = wdi_enerenew,
  Access_Electricity             = wdi_acel,
  Latitude                       = lp_lat_abst,
  Democracy                      = fh_ipolity2,
  GDPpc                          = wdi_gdpcapcon2010,
  Economic_Growth                = wdi_gdpgr,
  Trade_pcGDP                    = wdi_trade,
  # group membership flags
  SIDS                           = sids,
  EU                             = eu,
  G77                            = g77,
  African_Group                  = africa,
  Arab_States                    = arab,
  EIG                            = eig,
  Umbrella_Group                 = umbrella,
  LDCs                           = ldc
)
```





```{r}
# visualise new missing data
# Raster plot of missingness in the final table: is.na() yields a logical
# matrix and reshape2::melt() turns it into long format (Var1 = row,
# Var2 = column, value = TRUE where the cell is missing).
model_data_imputed %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
    geom_raster() +
    coord_flip() +
    scale_y_continuous(NULL, expand = c(0, 0)) +
    # Named labels are matched to the logical values themselves, so the legend
    # stays correct when the table contains no TRUE (missing) cells at all.
    scale_fill_grey(name = "",
                    labels = c("FALSE" = "Present",
                               "TRUE" = "Missing")) +
    xlab("Observation") +
    theme(axis.text.y = element_text(size = 6))
```


```{r}
# Save the imputed, renamed modelling table as a CSV file in the working directory
readr::write_csv(model_data_imputed, "model_data_imputed.csv")
```
